Global_fishing_watch_workflow

Author

Théophile L. Mouton

Published

September 3, 2024

Combining AIS and Sentinel-1 SAR fishing effort data from Global Fishing Watch

Open R libraries

Code
# Load required libraries
library(tidyverse)
library(maps)
library(ggplot2)
library(gridExtra)
library(sf)
library(viridis)
library(scales)
library(dplyr)
library(randomForest)
library(caret)
library(pdp)
library(knitr)
library(kableExtra)

AIS data

Data from Kroodsma et al. (2018) Science, accessible at: Global Fishing Watch Data Download Portal.

The data is accessible up to the end of the year 2020, we used four entire years of data (2017, 2018, 2019, 2020) to match SAR data records.

Code
# Set the path to the 2017-2020 folder

#path <- "Data/AIS Fishing Effort 2017-2020"

# List all CSV files in the folder
#AIS_csv_files <- list.files(path = here::here(path), pattern = "*.csv", full.names = TRUE, recursive = TRUE)

# Read all CSV files and combine them into a single data frame
#AIS_fishing <- AIS_csv_files %>%
#  map_df(~read_csv(.))

load(here::here("Data","AIS_fishing.Rdata"))

# Aggregate fishing hours by latitude and longitude
aggregated_AIS_fishing <- AIS_fishing %>%
  group_by(cell_ll_lat, cell_ll_lon) %>%
  summarise(total_fishing_hours = sum(fishing_hours, na.rm = TRUE)) %>%
  ungroup() %>%
  # Remove any cells with zero or negative fishing hours
  filter(total_fishing_hours > 0)

# Function to standardize coordinates to 0.1 degree resolution
standardize_coords <- function(lon, lat) {
  list(
    lon_std = floor(lon * 10) / 10,
    lat_std = floor(lat * 10) / 10
  )
}

# Standardize and aggregate AIS data
AIS_data_std <- aggregated_AIS_fishing %>%
  mutate(coords = map2(cell_ll_lon, cell_ll_lat, standardize_coords)) %>%
  mutate(
    lon_std = map_dbl(coords, ~ .x$lon_std),
    lat_std = map_dbl(coords, ~ .x$lat_std)
  ) %>%
  group_by(lon_std, lat_std) %>%
  summarise(total_fishing_hours = sum(total_fishing_hours, na.rm = TRUE), .groups = "drop")

# Create the world map
world_map <- map_data("world")

# Create the plot
ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(long = long, lat = lat, map_id = region),
           color = "black", fill = "lightgray", size = 0.1) +
  geom_tile(data = AIS_data_std, 
            aes(x = lon_std, y = lat_std, fill = total_fishing_hours)) +
  scale_fill_viridis(
    option = "inferno",
    direction = -1,
    trans = "log1p",
    name = "AIS Fishing Effort (hours; 2017-2020)", 
    breaks = c(0, 1, 10, 100, 1000, 10000, 100000),
    labels = scales::comma,
    guide = guide_colorbar(barwidth = 20, barheight = 0.5, title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(title = NULL,
       x = "Longitude", y = "Latitude") +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )

Sentinel-1 SAR data

Data from Paolo et al. 2024 Nature, accessible at: Global Fishing Watch SAR Vessel Detections

The data is accessible from 2017 to today, we used four entire years of data (2017, 2018, 2019, 2020) to match AIS records.

Code
# Set the path to the 2016 folder
#path <- "Data/SAR Vessel detections 2017-2020"

# List all CSV files in the folder
#SAR_csv_files <- list.files(path = here::here(path), pattern = "*.csv", full.names = TRUE)

# Read all CSV files and combine them into a single data frame
#SAR_fishing <- SAR_csv_files %>%
#  map_df(~read_csv(.))

load(here::here("Data","SAR_fishing.Rdata"))

# Aggregate fishing hours by latitude and longitude
aggregated_SAR_fishing <- SAR_fishing %>%
  mutate(
    lat_rounded = round(lat, digits = 2),
    lon_rounded = round(lon, digits = 2)
  ) %>%
  group_by(lat_rounded, lon_rounded) %>%
  filter(fishing_score >= 0.9) %>%
  summarise(
    total_presence_score = sum(presence_score, na.rm = TRUE),
    avg_fishing_score = mean(fishing_score, na.rm = TRUE),
    count = n()
  ) %>%
  mutate(total_presence_score = round(total_presence_score, digits = 0)) %>%
  ungroup()

# Standardize and aggregate SAR data
SAR_data_std <- aggregated_SAR_fishing %>%
  mutate(coords = map2(lon_rounded, lat_rounded, standardize_coords)) %>%
  mutate(
    lon_std = map_dbl(coords, ~ .x$lon_std),
    lat_std = map_dbl(coords, ~ .x$lat_std)
  ) %>%
  group_by(lon_std, lat_std) %>%
  summarise(total_presence_score = sum(total_presence_score, na.rm = TRUE), .groups = "drop")

# Create the world map
world_map <- map_data("world")

# Create the plot
ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(long = long, lat = lat, map_id = region),
           color = "black", fill = "lightgray", size = 0.1) +
  geom_tile(data = SAR_data_std, 
            aes(x = lon_std, y = lat_std, fill = total_presence_score)) +
  scale_fill_viridis(
    option = "inferno",
    direction = -1,
    trans = "log1p",
    name = "Fishing vessel detections (2017-2020)", 
    breaks = c(0, 1, 10, 100, 1000, 10000),
    labels = scales::comma,
    guide = guide_colorbar(barwidth = 20, barheight = 0.5, title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(title = NULL,
       x = "Longitude", y = "Latitude") +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )

Compare AIS, and SAR detection locations

Identifying grid cells with only AIS, only SAR detections or both data types.

Code
# Merge the datasets
combined_data <- full_join(
  AIS_data_std,
  SAR_data_std,
  by = c("lon_std", "lat_std")
)

# Categorize each cell
combined_data <- combined_data %>%
  mutate(category = case_when(
    total_fishing_hours > 0 & total_presence_score > 0 ~ "Both AIS and SAR",
    total_fishing_hours > 0 & (is.na(total_presence_score) | total_presence_score == 0) ~ "Only AIS",
    (is.na(total_fishing_hours) | total_fishing_hours == 0) & total_presence_score > 0 ~ "Only SAR",
    TRUE ~ "No fishing detected"
  ))

# Create the world map
world_map <- map_data("world")

# Create the plot
world_plot <- ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(long = long, lat = lat, map_id = region),
           color = "black", fill = "lightgray", size = 0.1) +
  geom_tile(data = combined_data, 
            aes(x = lon_std, y = lat_std, fill = category)) +
  scale_fill_manual(
    values = c("Both AIS and SAR" = "purple", 
               "Only AIS" = "blue", 
               "Only SAR" = "red", 
               "No fishing detected" = "white"),
    name = "Fishing Data Source",
    guide = guide_legend(title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(title = "Global Fishing Detection",
       subtitle = "Comparison of AIS (2017-2020) and SAR (2017-2020) data at 0.1-degree resolution",
       x = "Longitude", y = "Latitude") +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )

# Display the plot
print(world_plot)

Code
# Get summary statistics
summary_stats <- combined_data %>% 
  count(category) %>% 
  mutate(percentage = n / sum(n) * 100) %>%
  rename(`Number of cells` = n) %>%
  mutate(percentage = round(percentage, 2))

# Create and display the table
kable(summary_stats, 
      format = "html", 
      col.names = c("Category", "Number of cells", "Percentage (%)"),
      caption = "Summary Statistics of Data Categories") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE, 
                position = "center") %>%
  column_spec(2, width = "150px") %>%
  column_spec(3, width = "150px")
Summary Statistics of Data Categories
Category Number of cells Percentage (%)
Both AIS and SAR 163095 9.12
Only AIS 1566190 87.60
Only SAR 58668 3.28

Random forest model

Predicting fishing hours in areas with only SAR data at a 0.1 degree resolution. Using a random forest model of >160,000 associations between SAR vessel detections and AIS fishing hours globally.

Code
#---- Open the AIS data 

#load(here::here("Data", "AIS_fishing.Rdata"))

# Aggregate fishing hours by latitude and longitude
aggregated_AIS_fishing <- AIS_fishing %>%
  group_by(cell_ll_lat, cell_ll_lon) %>%
  summarise(total_fishing_hours = sum(fishing_hours, na.rm = TRUE)) %>%
  ungroup() %>%
  # Remove any cells with zero or negative fishing hours
  filter(total_fishing_hours > 0)

#---- Open the SAR data 

#load(here::here("Data", "SAR_fishing.Rdata"))

# Aggregate fishing hours by latitude and longitude
aggregated_SAR_fishing <- SAR_fishing %>%
  mutate(
    lat_rounded = round(lat, digits = 2),
    lon_rounded = round(lon, digits = 2)
  ) %>%
  #  filter(matched_category == "fishing") %>%
  group_by(lat_rounded, lon_rounded) %>%
  filter(fishing_score >= 0.9) %>%
  summarise(
    total_presence_score = sum(presence_score, na.rm = TRUE),
    avg_fishing_score = mean(fishing_score, na.rm = TRUE),
    count = n()
  ) %>%
  mutate(total_presence_score = round(total_presence_score, digits = 0)) %>%
  ungroup()


# Function to standardize coordinates to 0.1 degree resolution
standardize_coords <- function(lon, lat) {
  list(
    lon_std = floor(lon * 10) / 10,
    lat_std = floor(lat * 10) / 10
  )
}

# Aggregate AIS data to 0.1 degree resolution
AIS_data_01deg <- aggregated_AIS_fishing %>%
  mutate(coords = purrr::map2(cell_ll_lon, cell_ll_lat, standardize_coords)) %>%
  mutate(
    lon_std = sapply(coords, function(x) x$lon_std),
    lat_std = sapply(coords, function(x) x$lat_std)
  ) %>%
  group_by(lon_std, lat_std) %>%
  summarise(total_fishing_hours = sum(total_fishing_hours, na.rm = TRUE), .groups = "drop")

# Aggregate SAR data to 0.1 degree resolution
SAR_data_01deg <- aggregated_SAR_fishing %>%
  mutate(coords = purrr::map2(lon_rounded, lat_rounded, standardize_coords)) %>%
  mutate(
    lon_std = sapply(coords, function(x) x$lon_std),
    lat_std = sapply(coords, function(x) x$lat_std)
  ) %>%
  group_by(lon_std, lat_std) %>%
  summarise(total_presence_score = sum(total_presence_score, na.rm = TRUE), .groups = "drop")

# Merge the datasets
combined_data_01deg <- full_join(
  AIS_data_01deg,
  SAR_data_01deg,
  by = c("lon_std", "lat_std")
) %>%
  mutate(
    has_AIS = !is.na(total_fishing_hours) & total_fishing_hours > 0,
    has_SAR = !is.na(total_presence_score) & total_presence_score > 0
  )

# Separate the data into training (both AIS and SAR) and prediction (only SAR) sets
training_data <- combined_data_01deg %>%
  filter(has_AIS & has_SAR) %>%
  select(total_fishing_hours, total_presence_score, lon_std, lat_std)

prediction_data <- combined_data_01deg %>%
  filter(!has_AIS & has_SAR) %>%
  select(total_presence_score, lon_std, lat_std)

# Train the random forest model with timing
#set.seed(123)  # for reproducibility
#rf_timing <- system.time({
#  rf_model_01deg <- randomForest(
#    total_fishing_hours ~ total_presence_score + lon_std + lat_std,
#    data = training_data,
#    ntree = 500,
#    importance = TRUE
#  )
#})

# Print the time taken
#print(paste("Random Forest model training time:", rf_timing["elapsed"], "seconds"))

#save(rf_model_01deg,file="rf_model_01deg.Rdata")
load(here::here("rf_model_01deg.Rdata"))

#Visualise results 
# Make predictions
predictions <- predict(rf_model, newdata = prediction_data)

# Add predictions to the original dataset
combined_data_01deg <- combined_data_01deg %>%
  mutate(
    predicted_fishing_hours = case_when(
      has_AIS ~ total_fishing_hours,
      has_SAR ~ predict(rf_model, newdata = select(., total_presence_score, lon_std, lat_std)),
      TRUE ~ 0
    )
  )

# Create the world map
world_map <- map_data("world")

#Map of predicted fishing hours only 
# Prepare the data for the map
map_data <- combined_data_01deg %>%
  filter(!has_AIS & has_SAR) %>%
  select(lon_std, lat_std, predicted_fishing_hours)

# Create the map
predicted_SAR_only_plot <- ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(long, lat, map_id = region),
           color = "darkgray", fill = "lightgray", size = 0.1) +
  geom_tile(data = map_data, 
            aes(x = lon_std, y = lat_std, fill = predicted_fishing_hours)) +
  scale_fill_viridis(
    option = "inferno",
    direction = -1,
    trans = "log1p",
    name = "Predicted fishing hours (2017-2020)", 
    breaks = c(0, 1, 10, 100, 1000, 10000, 100000, 1000000),
    labels = scales::comma,
    guide = guide_colorbar(barwidth = 20, barheight = 0.5, title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(title = "Predicted Fishing Hours in Areas with Only SAR Detections",
       subtitle = "0.1 degree resolution",
       x = "Longitude", y = "Latitude") +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )

# Print the map
print(predicted_SAR_only_plot)

Code
#Map of both original and predicted AIS fishing hours 
# Visualize the results
predicted_plot <- ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(long, lat, map_id = region),
           color = "black", fill = "lightgray", size = 0.1) +
  geom_tile(data = combined_data_01deg, 
            aes(x = lon_std, y = lat_std, fill = predicted_fishing_hours)) +
  scale_fill_viridis(
    option = "inferno",
    direction = -1,
    trans = "log1p",
    name = "AIS Fishing Effort (hours; 2017-2020)", 
    breaks = c(0, 1, 10, 100, 1000, 10000, 100000, 1000000),
    labels = scales::comma,
    guide = guide_colorbar(barwidth = 20, barheight = 0.5, title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(title = "Global Fishing Hours (0.1 degree resolution)",
       subtitle = "Based on AIS data and Random Forest predictions from SAR data",
       x = "Longitude", y = "Latitude") +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )

print(predicted_plot)

Code
#Aggregate data to 1 degree resolution 
# First, round the coordinates to the nearest degree
combined_data_1deg <- combined_data_01deg %>%
  mutate(
    lon_1deg = round(lon_std),
    lat_1deg = round(lat_std)
  )

# Aggregate the data
aggregated_data <- combined_data_1deg %>%
  group_by(lon_1deg, lat_1deg) %>%
  summarise(
    predicted_fishing_hours = sum(predicted_fishing_hours, na.rm = TRUE),
    .groups = 'drop'
  )

# Create the world map
world_map <- map_data("world")

# Create the map
predicted_plot_1deg <- ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(long, lat, map_id = region),
           color = "black", fill = "lightgray", size = 0.1) +
  geom_tile(data = aggregated_data, 
            aes(x = lon_1deg, y = lat_1deg, fill = predicted_fishing_hours)) +
  scale_fill_viridis(
    option = "inferno",
    direction = -1,
    trans = "log1p",
    name = "AIS Fishing Effort (hours; 2017-2020)", 
    breaks = c(0, 1, 10, 100, 1000, 10000, 100000, 1000000),
    labels = scales::comma,
    guide = guide_colorbar(barwidth = 20, barheight = 0.5, title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(title = "Global Fishing Hours (1 degree resolution)",
       subtitle = "Based on AIS data and Random Forest predictions from SAR data made at 0.1 degree resolution",
       x = "Longitude", y = "Latitude") +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )

# Print the map
print(predicted_plot_1deg)

Code
# Evaluate the model
# Function to evaluate models
evaluate_model <- function(model, data, log_target = FALSE) {
  predictions <- predict(model, newdata = data)
  if (log_target) {
    predictions <- 10^predictions - 1
  }
  
  actual <- data$total_fishing_hours
  
  # Basic Error Metrics
  mae <- mean(abs(actual - predictions), na.rm = TRUE)
  rmse <- sqrt(mean((actual - predictions)^2, na.rm = TRUE))
  mape <- mean(abs((actual - predictions) / actual) * 100, na.rm = TRUE)
  medae <- median(abs(actual - predictions), na.rm = TRUE)
  
  # R-squared and Adjusted R-squared
  ss_total <- sum((actual - mean(actual))^2, na.rm = TRUE)
  ss_residual <- sum((actual - predictions)^2, na.rm = TRUE)
  r_squared <- 1 - (ss_residual / ss_total)
  n <- length(actual)
  p <- length(model$forest$independent.variable.names) # Number of predictors
  adj_r_squared <- 1 - ((1 - r_squared) * (n - 1) / (n - p - 1))
  
  # Residual Analysis
  residuals <- actual - predictions
  mean_residual <- mean(residuals, na.rm = TRUE)
  sd_residual <- sd(residuals, na.rm = TRUE)
  
  # Feature Importance (for Random Forest)
  feature_importance <- importance(model)
  
  return(list(
    "Mean Absolute Error" = mae,          # Mean Absolute Error
    "Root Mean Squared Error" = rmse,     # Root Mean Squared Error
    "Mean Absolute Percentage Error" = mape,                          # Mean Absolute Percentage Error
    "Median Absolute Error" = medae,                        # Median Absolute Error
    "R-Squared" = r_squared,                # R-Squared (Coefficient of Determination)
    "Adjusted R-Squared" = adj_r_squared,   # Adjusted R-Squared
    "Mean of Residuals" = mean_residual,        # Mean of Residuals
    "Standard Deviation of Residuals" = sd_residual,            # Standard Deviation of Residuals
    "Feature Importance" = feature_importance # Feature Importance 
  ))
}

# Merge the datasets
combined_data_01deg <- full_join(
  AIS_data_01deg,
  SAR_data_01deg,
  by = c("lon_std", "lat_std")
) %>%
  mutate(
    has_AIS = !is.na(total_fishing_hours) & total_fishing_hours > 0,
    has_SAR = !is.na(total_presence_score) & total_presence_score > 0,
    data_category = case_when(
      has_AIS & has_SAR ~ "Both AIS and SAR",
      has_AIS & !has_SAR ~ "Only AIS",
      !has_AIS & has_SAR ~ "Only SAR",
      TRUE ~ "No fishing detected"
    )
  )

# Evaluate all models
validation_data <- combined_data_01deg %>% filter(data_category == "Both AIS and SAR")
results_no_transform <- evaluate_model(rf_model, validation_data)

# Create a dataframe for the table
results_table <- data.frame(
  Metric = c("Mean Absolute Error", "Root Mean Squared Error", "Mean Absolute Percentage Error",
             "Median Absolute Error", "R-Squared", "Adjusted R-Squared",
             "Mean of Residuals", "Standard Deviation of Residuals"),
  Value = c(results_no_transform$`Mean Absolute Error`,
            results_no_transform$`Root Mean Squared Error`,
            results_no_transform$`Mean Absolute Percentage Error`,
            results_no_transform$`Median Absolute Error`,
            results_no_transform$`R-Squared`,
            results_no_transform$`Adjusted R-Squared`,
            results_no_transform$`Mean of Residuals`,
            results_no_transform$`Standard Deviation of Residuals`)
)

# Create and display the table
kable(results_table, format = "html", digits = 4, caption = "Model Evaluation Metrics") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE, position = "center")
Model Evaluation Metrics
Metric Value
Mean Absolute Error 275.0480
Root Mean Squared Error 1177.0435
Mean Absolute Percentage Error 1965.4999
Median Absolute Error 55.1906
R-Squared 0.9166
Adjusted R-Squared 0.9166
Mean of Residuals -0.1617
Standard Deviation of Residuals 1177.0471
Code
# For feature importance, create a separate table
feature_importance <- as.data.frame(results_no_transform$`Feature Importance`)
feature_importance$Feature <- rownames(feature_importance)
feature_importance <- feature_importance[, c("Feature", "%IncMSE", "IncNodePurity")]
colnames(feature_importance) <- c("Feature", "%IncMSE", "IncNodePurity")

# Sort the feature importance table by %IncMSE in descending order
feature_importance <- feature_importance[order(-feature_importance$`%IncMSE`), ]

kable(feature_importance, format = "html", digits = 4, caption = "Feature Importance") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE, position = "center") %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(2:3, width = "150px")
Feature Importance
Feature %IncMSE IncNodePurity
total_presence_score total_presence_score 98.8369 987162725907
lon_std lon_std 43.6550 791743926811
lat_std lat_std 38.6756 696033312224

Model evaluation interpretation:

Strengths: The model explains a large portion of the variance in the data (high R² and Adjusted R²), and the median error (MedAE) is reasonably low, suggesting that the model is accurate for many of the predictions.

Weaknesses: The high RMSE, MAPE, and SD of residuals indicate the presence of some large errors and possible outliers that are skewing the results. The very high MAPE is particularly concerning, suggesting that the model may struggle with predictions for certain instances, possibly due to the nature of the data.

Feature Importance: The model relies heavily on the total_presence_score (total vessel detections) feature, with geographical location features also playing significant roles.

To improve the model, consider investigating the outliers and errors, potentially refining the model or using alternative modeling approaches that might handle these cases better.